{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "64974d33-d028-440c-86fa-1a0633b3d31d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copyright 2021 NVIDIA Corporation. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "# =============================================================================="
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3f0ff46-9958-4d57-9067-a64be34e75da",
   "metadata": {},
   "source": [
    "<img src=\"http://developer.download.nvidia.com/compute/machine-learning/frameworks/nvidia_logo.png\" style=\"width: 90px; float: right;\">\n",
    "\n",
    "# GPT-2 Playground\n",
    "\n",
    "This notebook demonstrates the GPT-2 model for open-ended text generation.\n",
    "\n",
    "The TensorRT HuggingFace GPT-2 model is a drop-in replacement for the original PyTorch HuggingFace GPT-2 model.\n",
    "\n",
    "\n",
    "**Notes**: \n",
    " - For \"CPU - PyTorch\" and \"GPU - PyTorch\", a GPT-2 small model from the HuggingFace model repository is employed. Inference is carried out with PyTorch in FP32 precision. All models run with batch size 1.\n",
    "Each measurement performs 6 runs; the first run is discarded and the average run time across the remaining 5 runs is reported.\n",
    " - Prior to running this notebook, run [gpt2.ipynb](gpt2.ipynb) to download the GPT-2 model and generate the TensorRT engine."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3530e767-7050-4329-a4bc-e2221b9eb578",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2021-10-18 00:51:18,099][OSS][INFO] Reading and loading engine file ./models/gpt2/trt-engine/gpt2.onnx.engine using trt native runner.\n",
      "[2021-10-18 00:51:22,581][OSS][DEBUG] Number of profiles detected in engine: 2\n",
      "[2021-10-18 00:51:22,585][OSS][DEBUG] Selected profile: [(1, 1), (1, 32), (1, 64)]\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "ROOT_DIR = os.path.abspath(\"../\")\n",
    "sys.path.append(ROOT_DIR)\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "import torch \n",
    "\n",
    "# huggingface\n",
    "from transformers import (\n",
    "    GPT2LMHeadModel,\n",
    "    GPT2Tokenizer,\n",
    "    GPT2Config,\n",
    ")\n",
    "\n",
    "from GPT2.trt import GPT2TRTDecoder, GPT2TRTEngine\n",
    "from NNDF.networks import NetworkMetadata, Precision\n",
    "from collections import namedtuple \n",
    "from GPT2.GPT2ModelConfig import GPT2ModelTRTConfig\n",
    "\n",
    "# Download the HuggingFace model, its configuration and its tokenizer.\n",
    "GPT2_VARIANT = 'gpt2' # choices: gpt2 | gpt2-large\n",
    "model = GPT2LMHeadModel.from_pretrained(GPT2_VARIANT)\n",
    "# The first positional argument of GPT2Config is vocab_size, so GPT2Config(GPT2_VARIANT)\n",
    "# would not load the variant's settings; from_pretrained does.\n",
    "config = GPT2Config.from_pretrained(GPT2_VARIANT)\n",
    "tokenizer = GPT2Tokenizer.from_pretrained(GPT2_VARIANT)\n",
    "\n",
    "# Load the TensorRT engine.\n",
    "from os.path import exists\n",
    "# NOTE(review): the engine path is fixed to the 'gpt2' variant; confirm the on-disk layout before changing GPT2_VARIANT.\n",
    "TRT_ENGINE_PATH = './models/gpt2/trt-engine/gpt2.onnx.engine'\n",
    "metadata = NetworkMetadata(GPT2_VARIANT, Precision('fp16'), None)\n",
    "# Define both names up front so later cells can test for a missing engine instead of raising NameError.\n",
    "gpt2_engine = None\n",
    "gpt2_trt = None\n",
    "if exists(TRT_ENGINE_PATH):\n",
    "    gpt2_engine = GPT2TRTEngine(TRT_ENGINE_PATH, metadata)\n",
    "    gpt2_trt = GPT2TRTDecoder(gpt2_engine, metadata, config)\n",
    "else:\n",
    "    print(\"Error: TensorRT engine not found at ./models/gpt2/trt-engine/gpt2.onnx.engine. Please run gpt2.ipynb to generate the TensorRT engine first!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "766b8c94-ba8e-47c8-8624-57da462a0496",
   "metadata": {
    "jupyter": {
     "source_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f4971e06250e4ff78206e05b67a0f1d6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Textarea(value='TensorRT is a high performance deep learning inference platform that delivers low latency and …"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bd35da4869764703b4ae4495af4fdbc2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Textarea(value='...', description='GPT-2:', layout=Layout(width='auto'), placeholder='GPT-2 generated text', r…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6bd69c0458a8426f904a6672a1d885c0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "RadioButtons(description='Device:', options=('CPU - PyTorch', 'GPU - PyTorch', 'GPU - TensorRT'), value='CPU -…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "01dabe5d2c4340d3b163d6b680776755",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(Button(description='Generate', style=ButtonStyle()),), layout=Layout(align_items='center', disp…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "865124339ace4e0986b07f516bdac937",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "IntProgress(value=0, description='Progress:', layout=Layout(height='50px', width='100%'), max=6, style=Progres…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "975e84c56f1f43578abaf8260a38aee5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Output()"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import ipywidgets as widgets\n",
    "import numpy as np\n",
    "import time\n",
    "\n",
    "# Import display explicitly before its first use instead of relying on the name IPython injects.\n",
    "from IPython.display import display\n",
    "\n",
    "# Number of generate() calls per click; generate() averages the timings of all but the first call.\n",
    "N_RUN = 6\n",
    "\n",
    "# Backend selector; generate() branches on these exact option strings.\n",
    "device = widgets.RadioButtons(\n",
    "    options=['CPU - PyTorch',\n",
    "             'GPU - PyTorch',\n",
    "             'GPU - TensorRT'],\n",
    "    description='Device:',\n",
    "    disabled=False\n",
    ")\n",
    "\n",
    "# Editable prompt that is tokenized and fed to the model.\n",
    "paragraph_text = widgets.Textarea(\n",
    "    value=('TensorRT is a high performance deep learning inference platform that delivers low latency and high throughput for apps '\n",
    "           'such as recommenders, speech and image/video on NVIDIA GPUs. '),\n",
    "    placeholder='Type something',\n",
    "    description='Context:',\n",
    "    disabled=False,\n",
    "    layout=widgets.Layout(width='auto'),\n",
    "    rows=5,\n",
    ")\n",
    "\n",
    "# Receives the decoded model output after each click.\n",
    "generated_text = widgets.Textarea(\n",
    "    value='...',\n",
    "    placeholder='GPT-2 generated text',\n",
    "    description='GPT-2:',\n",
    "    disabled=False,\n",
    "    layout=widgets.Layout(width='auto'),\n",
    "    rows=5,\n",
    ")\n",
    "button = widgets.Button(description='Generate')\n",
    "\n",
    "display(paragraph_text)\n",
    "display(generated_text)\n",
    "display(device)\n",
    "\n",
    "# Column flex layout that centres the button horizontally.\n",
    "box_layout = widgets.Layout(display='flex',\n",
    "                            flex_flow='column',\n",
    "                            align_items='center',\n",
    "                            width='100%')\n",
    "\n",
    "# Advanced by one after every generate() call made for a click.\n",
    "progress_bar = widgets.IntProgress(\n",
    "    value=0,\n",
    "    min=0,\n",
    "    max=N_RUN,\n",
    "    description='Progress:',\n",
    "    bar_style='', # 'success', 'info', 'warning', 'danger' or ''\n",
    "    style={'bar_color': 'green'},\n",
    "    orientation='horizontal',\n",
    "    layout=widgets.Layout(width='100%', height='50px')\n",
    ")\n",
    "\n",
    "box = widgets.HBox(children=[button], layout=box_layout)\n",
    "# Captures whatever the button callback prints.\n",
    "output = widgets.Output()\n",
    "display(box)\n",
    "display(progress_bar)\n",
    "display(output)\n",
    "\n",
    "def _timed_generate(generate_fn, input_ids, max_length, on_cuda):\n",
    "    \"\"\"Call generate_fn N_RUN times, advancing the progress bar after each call.\n",
    "\n",
    "    Args:\n",
    "        generate_fn: callable invoked as generate_fn(input_ids, max_length=max_length).\n",
    "        input_ids: token ids, already placed on the device the backend runs on.\n",
    "        max_length: value forwarded as the max_length keyword of generate_fn.\n",
    "        on_cuda: when True, the GPU is synchronized around every timed call.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (output of the last call, list of per-call wall-clock times in seconds).\n",
    "    \"\"\"\n",
    "    run_times = []\n",
    "    sample_output = None\n",
    "    for _ in range(N_RUN):\n",
    "        if on_cuda:\n",
    "            # CUDA work is queued asynchronously; drain it before starting the clock ...\n",
    "            torch.cuda.synchronize()\n",
    "        start_time = time.perf_counter()\n",
    "        sample_output = generate_fn(input_ids, max_length=max_length)\n",
    "        if on_cuda:\n",
    "            # ... and before stopping it, so the measurement covers the whole call.\n",
    "            torch.cuda.synchronize()\n",
    "        run_times.append(time.perf_counter() - start_time)\n",
    "        progress_bar.value += 1\n",
    "    return sample_output, run_times\n",
    "\n",
    "\n",
    "def generate(b):\n",
    "    \"\"\"Button callback: generate text from the context box on the selected backend.\n",
    "\n",
    "    The decoded text of the last run is written to the GPT-2 text area and the\n",
    "    average run time, excluding the first run, is printed to the output widget.\n",
    "    Problems that prevent a run (empty or over-long context, missing GPU, missing\n",
    "    TensorRT engine) are reported as messages in the output widget.\n",
    "\n",
    "    Args:\n",
    "        b: the clicked button, as passed by ipywidgets; unused.\n",
    "    \"\"\"\n",
    "    progress_bar.value = 0\n",
    "    with output:\n",
    "        backend = device.value\n",
    "        on_cuda = backend != 'CPU - PyTorch'\n",
    "\n",
    "        if not paragraph_text.value:\n",
    "            print('Error: the context is empty. Please type some text first.')\n",
    "            return\n",
    "        if on_cuda and not torch.cuda.is_available():\n",
    "            print('Error: %s requires a CUDA device, but none is available.' % backend)\n",
    "            return\n",
    "\n",
    "        max_length = GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[GPT2_VARIANT]\n",
    "        input_ids = tokenizer(paragraph_text.value, return_tensors='pt').input_ids\n",
    "        if input_ids.shape[-1] >= max_length:\n",
    "            print('Error: the context has %d tokens; it must be shorter than %d tokens.' % (input_ids.shape[-1], max_length))\n",
    "            return\n",
    "\n",
    "        if backend == 'GPU - TensorRT':\n",
    "            # Looked up by name because the decoder only exists when the engine file was found.\n",
    "            trt_decoder = globals().get('gpt2_trt')\n",
    "            if trt_decoder is None:\n",
    "                print('Error: TensorRT engine is not loaded. Please run gpt2.ipynb to generate the TensorRT engine first!')\n",
    "                return\n",
    "            generate_fn = trt_decoder.generate\n",
    "        elif backend == 'CPU - PyTorch':\n",
    "            generate_fn = model.to('cpu').generate\n",
    "        elif backend == 'GPU - PyTorch':\n",
    "            generate_fn = model.to('cuda').generate\n",
    "        else:\n",
    "            return\n",
    "\n",
    "        # Move the prompt once, outside the timed loop, so the copy is not measured.\n",
    "        input_ids = input_ids.to('cuda' if on_cuda else 'cpu')\n",
    "        sample_output, run_times = _timed_generate(generate_fn, input_ids, max_length, on_cuda)\n",
    "\n",
    "        # de-tokenize model output to raw text\n",
    "        text = tokenizer.decode(sample_output[0], skip_special_tokens=True)\n",
    "        generated_text.value = text\n",
    "        # The first run is excluded from the average; fall back to it if it is the only run.\n",
    "        measured = run_times[1:] or run_times\n",
    "        print('%s - Average inference time: %.2f (ms)' % (backend, 1000*np.mean(measured)))\n",
    "\n",
    "button.on_click(generate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58f473c0-6682-41af-8040-72f0a9472b0f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
